# AMAR ET AL. - COUNTERING MISINFORMATION EARLY (2025)
## REPLICATION FILE: 09_regressions_main.R
### This file reproduces all regression results for the main endline and follow-up surveys, including unadjusted, adjusted, and CACE models
# ----
# Functions ----
## Unadjusted ----
# function to tabulate regressions for each dv
tab_regression <- function(varname, design) {
  fml <- as.formula(str_c(varname, "~ treatment", "+ library_spillover_pre"))
  mod <- svyglm(fml, design = design)
  out <- tidy(mod) %>%
    mutate(dv = varname, n = nobs(mod), .before = 1)
  out
}

## Adjusted ----
tab_regression_adj <- function(varname, design) {
  fml <- as.formula(str_c(varname, "~treatment", "+ library_spillover_pre + age + class + gender + hindi_medium + science + mobile_internet + asset_index + caste + reading + village_nightlight_viirs_mean_2021 + bjp_coalition_vote_share  + religion_hindu_num")) # still have to add: village level controls (development/night lights; BJP vote share at assembly constituency level)
  # add - age, medium of education, religion, reading, BJP vote share at the assembly constituency level.... 
  # remove - school
  mod <- svyglm(fml, design = design)
  
  # Extract observed values and weights
  y <- model.response(model.frame(mod))
  w <- weights(mod)
  
  # Calculate the weighted mean of the dependent variable
  weighted_mean_y <- sum(w * y) / sum(w)
  
  # Calculate Total Sum of Squares (TSS) with weights
  total_ss <- sum(w * (y - weighted_mean_y)^2)
  
  residual_ss <- sum(weights(mod)*residuals(mod)^2)
  survey_r_squared <- round(1 - (residual_ss / total_ss),3)
  
  out <- tidy(mod) %>%
    mutate(dv = varname, n = nobs(mod), r_squared = survey_r_squared, .before = 1)
  out
}


## CACE ----
tab_regression_cace <- function(varname, design) {
  # Extract raw data from survey design
  df <- design$variables
  
  # Build formula
  fml_second_stage <- as.formula(paste(
    varname,
    "~ treatment_uptake_minimal + library_spillover_pre | treatment + library_spillover_pre"
  ))
  
  # Fit IV model
  mod_2sls <- ivreg(fml_second_stage, data = df)
  
  # Clustered VCOV at village level
  cl_vcov <- sandwich::vcovCL(mod_2sls, cluster = df$village)
  
  # Get clustered SEs
  coefs <- lmtest::coeftest(mod_2sls, vcov = cl_vcov)
  
  # Tidy results
  out <- broom::tidy(coefs) %>%
    mutate(dv = varname,
           n = nobs(mod_2sls),
           r_squared = round(summary(mod_2sls)$r.squared, 3),
           .before = 1)
  
  return(out)
}

# Run regressions ----
## Unadjusted ----
# Main endline outcomes
regressions <- lapply(dv_labs$dv, \(x) tab_regression(x, bimli_svy_dkr.rm)) %>% # make a list of regression tables
  bind_rows %>%
  mutate(margin = qnorm(0.975) * std.error,
         lower = estimate - margin,
         upper = estimate + margin) %>%
  filter(term != "(Intercept)" & !grepl("library_spillover_pre", term)) %>% # remove intercepts and district FEs
  select(-term)

regressions <- left_join(dv_labs, regressions, by = "dv")

# Follow-up survey outcomes
regressions_follow_up <- lapply(follow_up_dvs$dv, \(x) tab_regression(x, bimli_svy_dkr.rm)) %>% # make a list of regression tables
  bind_rows %>%
  mutate(margin = qnorm(0.975) * std.error,
         lower = estimate - margin,
         upper = estimate + margin) %>%
  filter(term != "(Intercept)" & !grepl("library_spillover_pre", term)) %>% # remove intercepts and district FEs
  select(-term)
regressions_follow_up <- left_join(follow_up_dvs, regressions_follow_up, by = "dv")

# Guardian outcomes
regressions_guardian <- lapply(guardian_dvs$dv, \(x) tab_regression(x, bimli_svy_dkr.rm)) %>% # make a list of regression tables
  bind_rows %>%
  mutate(margin = qnorm(0.975) * std.error,
         lower = estimate - margin,
         upper = estimate + margin) %>%
  filter(term != "(Intercept)" & !grepl("library_spillover_pre", term)) %>% # remove intercepts and district FEs
  select(-term)
regressions_guardian <- left_join(guardian_dvs, regressions_guardian, by = "dv")

## Adjusted ----
# Main endline outcomes
regressions_adj <-  lapply(dv_labs_endline_main$dv, \(x) tab_regression_adj(x, bimli_svy_dkr.rm)) %>% # make a list of regression tables
  bind_rows %>%
  mutate(margin = qnorm(0.975) * std.error,
         lower = estimate - margin,
         upper = estimate + margin) %>%
  filter(term != "(Intercept)" & !grepl("library_spillover_pre", term) )

regressions_adj <- left_join(dv_labs_endline_main, regressions_adj, by = "dv")

regressions_adj <- regressions_adj %>%
  filter(type == "idx")

# Follow up survey outcomes
regressions_follow_up_adj <- lapply(follow_up_dvs$dv, \(x) tab_regression_adj(x, bimli_svy_dkr.rm)) %>% # make a list of regression tables
  bind_rows %>%
  mutate(margin = qnorm(0.975) * std.error,
         lower = estimate - margin,
         upper = estimate + margin) %>%
  filter(term != "(Intercept)" & !grepl("library_spillover_pre", term))
regressions_follow_up_adj <- left_join(follow_up_dvs, regressions_follow_up_adj, by = "dv")

# Guardian outcomes
regressions_guardian_adj <- lapply(guardian_dvs$dv, \(x) tab_regression_adj(x, bimli_svy_dkr.rm)) %>% # make a list of regression tables
  bind_rows %>%
  mutate(margin = qnorm(0.975) * std.error,
         lower = estimate - margin,
         upper = estimate + margin) %>%
  filter(term != "(Intercept)" & !grepl("library_spillover_pre", term))
regressions_guardian_adj <- left_join(guardian_dvs, regressions_guardian_adj, by = "dv")

## CACE ----
# Main survey outcomes
regressions_cace <- lapply(dv_labs_endline_main$dv, \(x) tab_regression_cace(x, bimli_svy_dkr.rm)) %>% # make a list of regression tables
  bind_rows %>%
  mutate(margin = qnorm(0.975) * std.error,
         lower = estimate - margin,
         upper = estimate + margin) %>%
  filter(term != "(Intercept)" & !grepl("library_spillover_pre", term)) %>% # remove intercepts and district FEs
  select(-term)
regressions_cace <- left_join(dv_labs_endline_main, regressions_cace, by = "dv")
regressions_cace <- regressions_cace %>%
  filter(type == "idx")

# ---- 
# Main endline ----
## Create figures ----
### Unadjusted ----
regressions %>%
  filter(is.na(subidx) & secondary_outcome == FALSE & mechanism == FALSE) %>% # omit questions that are used in a subindex
  mutate(label = if_else(str_detect(label, "Index"), "Index", label),
         label = as_factor(label),
         idx = as_factor(idx),
         is.idx = if_else(label == "Index", TRUE, FALSE),
         significance = ifelse(p.value < 0.05, "YES", "NO")) %>%
  ggplot(aes(x = estimate, y = fct_rev(label), color = is.idx, linetype = significance)) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_point() +
  geom_errorbar(aes(xmin = lower, xmax = upper), width = 0) +
  facet_grid(rows = "idx", scales = "free_y", space = "free_y", switch = "y") + # facet by index
  xlab("Estimated effect of treatment in control-group SDs (Positive expected)") +
  scale_y_discrete(position = "right") +
  scale_color_manual(values = c("black", "red")) +
  scale_linetype_manual(values=c("dotted", "solid")) +
  theme_bw() %+replace%
  theme(axis.title.y = element_blank(),
        strip.text.y.left = element_text(angle = 0),
        legend.position = "none")
ggsave("output/figures/effects.pdf", height = 5, width = 8)

### Adjusted ----
regressions_adj %>%
  filter(term == "treatmentMedia Literacy") %>%
  filter(is.na(subidx) & secondary_outcome == FALSE & mechanism == FALSE) %>% # omit questions that are used in a subindex
  mutate(label = if_else(str_detect(label, "Index"), "Index", label),
         label = as_factor(label),
         idx = as_factor(idx),
         is.idx = if_else(label == "Index", TRUE, FALSE),
         significance = ifelse(p.value < 0.05, "YES", "NO")) %>%
  ggplot(aes(x = estimate, y = fct_rev(label), color = is.idx, linetype = significance)) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_point() +
  geom_errorbar(aes(xmin = lower, xmax = upper), width = 0) +
  facet_grid(rows = "idx", scales = "free_y", space = "free_y", switch = "y") + # facet by index
  xlab("Estimated effect of treatment in control-group SDs (Positive expected)") +
  scale_y_discrete(position = "right") +
  scale_color_manual(values = c("black", "red")) +
  scale_linetype_manual(values=c("dotted", "solid")) +
  theme_bw() %+replace%
  theme(axis.title.y = element_blank(),
        strip.text.y.left = element_text(angle = 0),
        legend.position = "none")
ggsave("output/figures/effects_adj.pdf", height = 5, width = 8)

### ITT vs. CACE ----
regressions_itt_vs_cace <- 
  regressions %>%
  mutate(
    effect_type = "ITT") %>%
  rbind(
    regressions_cace %>% mutate(effect_type = "CACE") %>% select(-r_squared)
  ) %>%
  filter(is.na(subidx) & secondary_outcome == FALSE & mechanism == FALSE & grepl("Index", label))

regressions_itt_vs_cace %>%
  mutate(
    label = if_else(str_detect(label, "Index"), "Index", label),
    label = as_factor(label),
    idx = as_factor(idx),
    is.idx = if_else(label == "Index", TRUE, FALSE),
    significance = ifelse(p.value < 0.05, "YES", "NO")
  ) %>%
  ggplot(aes(x = estimate, y = fct_rev(effect_type), color = fct_rev(effect_type), linetype = significance)) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_point() +
  geom_errorbar(aes(xmin = lower, xmax = upper), width = 0) +
  facet_grid(rows = "idx", scales = "free_y", space = "free_y", switch = "y") + # facet by index
  xlab("Estimated effect") +
  scale_y_discrete(position = "right") +
  scale_color_manual(values = c("#2066a8", "#ea801c")) +
  scale_linetype_manual(values = c("dashed", "solid")) +
  labs(color = "Estimand") +  # Set title for effect_type legend
  guides(linetype = "none") +  # Remove linetype legend
  xlim(-0.1, 0.4) +
  theme_bw() %+replace%
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    strip.text.y.left = element_text(angle = 0),
    legend.position = "bottom"
  )
ggsave("output/figures/effects_cace_vs_itt.pdf", height = 5, width = 8)

## Create tables ----
### Unadjusted ----
# Compute control group statistics for each dependent variable
tab_control_stats <- function(varname, data) {
  # Subset only control group (treatment == 0)
  data_sub <- data[data$treatment == "Spoken English", ]
  
  # Compute mean and standard deviation with NA handling
  mean_val <- round(mean(data_sub[[varname]], na.rm = TRUE),2)
  sd_val <- round(sd(data_sub[[varname]], na.rm = TRUE),2)
  
  # Return as a one-row tibble
  tibble(
    dv = varname,
    control_mean = mean_val,
    control_sd = sd_val,
  )
}

control_stats <- lapply(dv_labs$dv,\(x) tab_control_stats(x, bimli)) %>% 
  bind_rows

# Create tables
regressions_table <- dv_labs %>%
  left_join(regressions, by = "dv") %>%
  left_join(control_stats, by = "dv") %>%
  rename(
    idx = idx.x,
    label = label.x,
    type = type.x ,
    secondary_outcome = secondary_outcome.x
  ) %>%
  select(-ends_with(".y"))

latex_regression <- function(idx_name) {
  df <- regressions_table %>%
    filter(idx == idx_name & secondary_outcome == FALSE) %>%
    select(label, type, n, estimate, std.error, p.value, control_mean, control_sd) %>%
    mutate(
      label = if_else(str_detect(label, "Index"), "Index", label),
      type = case_match(
        type,
        "idx" ~ "---",
        "subidx" ~ "Sub-index",
        "question" ~ "Question"
      ),
      n = format(n, big.mark = ","),
      sig.stars = case_when(
        p.value < 0.001 ~ "***",
        p.value < 0.01 ~ "**",
        p.value < 0.05 ~ "*",
        TRUE ~ ""
      ),
      estimate = str_c(format(round(estimate, 2), nsmall = 2), sig.stars),
      std.error = round(std.error, 2),
      p.value = case_when(
        p.value > 0.9 ~ "$>$0.9",
        p.value < 0.001 ~ "$<$0.001",
        p.value < 0.01 ~ as.character(round(p.value, 3)),
        TRUE ~ as.character(round(p.value, 3))
      )
    ) %>%
    select(-sig.stars) %>%
    rename(
      Outcome = label,
      Type = type,
      N = n,
      Estimate = estimate,
      SE = std.error,
      "p-value" = p.value,
      "Ctrl. Mean" = control_mean,
      "Ctrl. SD" = control_sd
    )
  
  # Check if dataframe is empty
  if (nrow(df) == 0) {
    cat("No data found for index:", idx_name, "\n")
    return()
  }
  
  xt <- xtable::xtable(
    df,
    caption = idx_name,
    align = "lllllllll",
    digits = 2,
    label = paste0("tab:", idx_name)
  )
  
  # Create the note text
  # Create the note text with parbox for proper wrapping
  note_text <- paste0("\\hline\\\\[-0.9em]\n\\multicolumn{8}{l}{\\begin{minipage}{1.2\\textwidth}\\small Note: This table reports treatment effects on ", 
                      tolower(idx_name), 
                      ", including the index as well as sub-index items. ICW was used to construct indices. Estimated using OLS regressions with library-spillover strata fixed effects and standard-errors clustered at the classroom level. All indices are standardized with respect to the control group mean and standard deviation. *p$<$0.05; **p$<$0.01; ***p$<$0.001.\\end{minipage}} \\\\\n")
  
  # Output file path
  tex_file <- paste0("output/tables/tab_unadjusted_", idx_name, ".tex")
  # Capture output in a string
  temp_conn <- textConnection("temp_tex", "w", local = TRUE)
  print(
    xt,
    file = temp_conn,
    include.rownames = FALSE,
    caption.placement = "top",
    sanitize.text.function = identity,
    sanitize.colnames.function = \(x) str_c("\\textbf{", x, "}"),
    add.to.row = list(
      pos = list(nrow(df)),
      command = note_text
    ),
    hline.after = c(-1, 0),
    comment = FALSE,
    table.placement = getOption("xtable.table.placement", "h!")
  )
  close(temp_conn)
  
  # Rest of your code remains the same...
  temp_tex <- gsub(
    pattern = "(\\\\begin\\{tabular\\}\\{[l|]+\\})",
    replacement = "\\\\scalebox{0.8}{\n\\1",
    x = temp_tex
  )
  temp_tex <- gsub(
    pattern = "(\\\\end\\{tabular\\})",
    replacement = "\\1\n}",
    x = temp_tex
  )
  writeLines(temp_tex, tex_file)
}

# Create latex table for each index
for (idx_name in unique(regressions$idx)) {
  latex_regression(idx_name)
  cat("\n")
}

### Adjusted ----
model_labels <- unique(regressions_adj$label)
model_labels <- model_labels[model_labels != "Civic Participation Index"]

# Relocate "Awareness Index" to the end
if("Awareness Index" %in% model_labels) {
  # Find the position of "Awareness Index"
  awareness_idx <- which(model_labels == "Awareness Index")
  # Remove "Awareness Index" from its original position
  model_labels <- model_labels[-awareness_idx]
  # Add it back at the end
  model_labels <- c(model_labels, "Awareness Index")
}



n_values <- sapply(model_labels, function(lbl) {
  # Get the first N value for each model (assuming it's consistent within models)
  n_val <- regressions_adj %>% 
    filter(label == lbl) %>% 
    pull(n) %>% 
    first()
  return(n_val)
})

# Extract R-squared values for each model
r2_values <- sapply(model_labels, function(lbl) {
  # Get the R-squared value for each model
  r2_val <- regressions_adj %>% 
    filter(label == lbl) %>% 
    pull(r_squared) %>% 
    first()
  return(r2_val)
})

# Format R² values to match your current precision
r2_formatted <- sprintf("%.3f", r2_values)

# Create the dynamic string for N and R² rows
n_row <- paste("N &", paste(n_values, collapse = " & "), "\\\\ \n")
r2_row <- paste("R2 &", paste(r2_formatted, collapse = " & "), "\\\\ \n")


regressions_adj_tab <- regressions_adj %>%
  select(label, term, estimate, std.error,p.value) %>%
  arrange(term)

# Filter out Civic Participation Index and pivot wider
regressions_adj_tab <- regressions_adj_tab %>%
  filter(label != "Civic Participation Index") %>%
  pivot_wider(
    names_from = label,
    values_from = c(estimate, std.error, p.value)
  )

# Pivot longer to create separate rows for estimate and p.value, keeping numeric types
regressions_adj_tab <- regressions_adj_tab %>%
  pivot_longer(
    cols = starts_with("estimate_") | starts_with("std.error_") | starts_with("p.value_"),
    names_to = c("stat", "label"),
    names_sep = "_",
    values_to = "value",
    values_transform = list(value = as.character)  # Convert to character to handle stars and numeric together
  )


p_values_adj <- regressions_adj_tab %>%
  filter(stat == "p.value") %>%
  select(label, term, p.value = value)  # Rename `value` to `p.value` for clarity

regressions_adj_tab <- regressions_adj_tab %>%
  left_join(p_values_adj, by = c("label", "term"))

regressions_adj_tab <- regressions_adj_tab %>%
  mutate(
    # Round and add significance stars based on `p.value`
    value = case_when(
      stat == "estimate" ~ paste0(sprintf("%.3f", as.numeric(value)),
                                  case_when(
                                    as.numeric(p.value) < 0.001 ~ "***",
                                    as.numeric(p.value) < 0.01 ~ "**",
                                    as.numeric(p.value) < 0.05 ~ "*",
                                    TRUE ~ ""
                                  )
      ),
      stat == "std.error" ~ sprintf("%.4f", as.numeric(value)),
      stat == "p.value" ~ sprintf("%.4f", as.numeric(value))
    )
  ) %>%
  select(-p.value)  %>%
  # Pivot to make `estimate`, `std.error`, and `p.value` columns wide
  pivot_wider(
    names_from = label,
    values_from = value
  ) %>%
  # Filter out rows where `stat` is `p.value`
  filter(stat != "p.value")  %>%
  # Correct way to drop the `p.value` column
  mutate(
    # Format std.error with parentheses and ensure estimate retains significance stars
    across(starts_with("Awareness Index"):starts_with("Engagement Behavior Index"),
           ~ ifelse(stat == "estimate", ., paste0("(", ., ")"))
    )
  ) %>%
  # Arrange terms in specified order
  arrange(
    match(term, c(
      "treatmentMedia Literacy", "age", "genderGirl", "class", "casteOBC/EBC", "casteSC", "casteST", "religion_hindu_num",
      "asset_index", "hindi_medium1", "reading", "science", "mobile_internetNo", "bjp_coalition_vote_share",  "village_nightlight_viirs_mean_2021"
    )),
    stat
  ) %>%
  mutate(
    # Remove `term` for p.value rows and rename variables
    term = ifelse(stat == "std.error", "", term)
  ) %>%
  select(-stat) %>%
  rename(Outcome = term) %>%
  mutate(Outcome = case_when(
    Outcome == "age" ~ "Age",
    Outcome == "asset_index" ~ "Asset Index",
    Outcome == "bjp_coalition_vote_share" ~ "BJP+ Vote Share",    
    Outcome == "casteOBC/EBC" ~ "OBC",   
    Outcome == "casteSC" ~ "SC",   
    Outcome == "casteST" ~ "ST",
    Outcome == "hindi_medium1" ~ "Hindi medium",
    Outcome == "mobile_internetNo" ~ "Mobile Internet",
    Outcome == "reading" ~ "Reading skill index",
    Outcome == "science" ~ "Science skill index",
    Outcome == "genderGirl" ~ "Girl",
    Outcome == "religion_hindu_num" ~ "Hindu",
    Outcome == "treatmentMedia Literacy" ~ "Treatment",
    Outcome == "class" ~ "Grade",
    Outcome == "village_nightlight_viirs_mean_2021" ~ "Village Nightlights",
    TRUE ~ Outcome  # Keep other values unchanged
  )) %>%
  rename_with(~ gsub("Index$", "", .), ends_with("Index"))


regressions_adj_tab <- regressions_adj_tab %>%
  relocate("Awareness ", .after = last_col())

regressions_adj_tab <- regressions_adj_tab %>%
  select(Outcome, everything())

# Ensure no row names in final_data
rownames(regressions_adj_tab) <- NULL

# Remove existing column names from final_data to avoid default headers in LaTeX output
colnames(regressions_adj_tab) <- NULL

# Generate LaTeX table with xtable
latex_table_adj <- xtable(regressions_adj_tab, 
                          caption = "Main results (adjusted model)", 
                          label = "tab:adj_main_estimates_terms")


# Manually specify header rows for two-line headers
header_row1_adj <- c("Outcome",  "Accuracy", "Sharing", "Health", "Source", "Engagement", "Engagement", "Awareness")
header_row2_adj <- c("",  "Discernment", "Discernment", "Preferences", "Discernment", "Attitude", "Behavior", "")


# Create the LaTeX code for the two-line headers with a single \hline after "Behavior" and no extra lines
latex_header_adj <- paste(
  paste(header_row1_adj, collapse = " & "), "\\\\",
  paste(header_row2_adj, collapse = " & "), "\\\\"
)

# Calculate the number of columns and rows as needed
num_cols_adj <- ncol(regressions_adj_tab) - 1  # Adjust for the alignment
num_rows_adj <- nrow(latex_table_adj)     # Ensure numeric row count

# Footer
command2 <- paste0(
  "\\hline\n", 
  n_row, 
  r2_row, 
  "Library-Spillover FEs&  Yes &  Yes &  Yes &  Yes &  Yes &  Yes &  Yes \\\\ \n",
  "\\hline\n",
  "\\multicolumn{", num_cols_adj + 1, "}{l}{*p$<$0.05; **p$<$0.01; ***p$<$0.001. SEs clustered at the village level in parentheses.} \\\\ \n"
)

# Capture the LaTeX output as a character vector without row names
latex_table_adj <- capture.output(print(latex_table_adj, type = "latex", include.rownames = FALSE, 
                                        sanitize.text.function = identity, 
                                        align = c("l", "l", rep("c", num_cols_adj)),  # Use calculated number of columns
                                        add.to.row = list(
                                          pos = list(0, num_rows_adj),  # Use calculated row count
                                          command = c(
                                            latex_header_adj,command2
                                          )
                                        )
))

# Find the index of `\centering` and insert `\small` right after it
centering_index_adj <- which(grepl("\\centering", 
                                   latex_table_adj))
latex_output_adj <- append(latex_table_adj, 
                           "\\resizebox{\\linewidth}{!}{%", 
                           after = centering_index_adj)

# Replace `\end{tabular}` with `%\n}`
latex_output_adj <- sub("\\\\end\\{tabular\\}", 
                        "\\\\end\\{tabular\\}%\n}", 
                        latex_output_adj)

# Write the captured output to the file
writeLines(latex_output_adj, "output/tables/tab_adjusted_main_results.tex")

### CACE ----
model_labels <- unique(regressions_cace$label)
model_labels <- model_labels[model_labels != "Civic Participation Index"]

# Relocate "Awareness Index" to the end
if("Awareness Index" %in% model_labels) {
  # Find the position of "Awareness Index"
  awareness_idx <- which(model_labels == "Awareness Index")
  # Remove "Awareness Index" from its original position
  model_labels <- model_labels[-awareness_idx]
  # Add it back at the end
  model_labels <- c(model_labels, "Awareness Index")
}



n_values <- sapply(model_labels, function(lbl) {
  # Get the first N value for each model (assuming it's consistent within models)
  n_val <- regressions_cace %>% 
    filter(label == lbl) %>% 
    pull(n) %>% 
    first()
  return(n_val)
})

# Extract R-squared values for each model
r2_values <- sapply(model_labels, function(lbl) {
  # Get the R-squared value for each model
  r2_val <- regressions_cace %>% 
    filter(label == lbl) %>% 
    pull(r_squared) %>% 
    first()
  return(r2_val)
})

# Format R² values to match your current precision
r2_formatted <- sprintf("%.3f", r2_values)

# Create the dynamic string for N and R² rows
n_row <- paste("N &", paste(n_values, collapse = " & "), "\\\\ \n")
r2_row <- paste("R2 &", paste(r2_formatted, collapse = " & "), "\\\\ \n")


table_data_cace <- regressions_cace %>%
  mutate(
    term="treatmentMedia Literacy" 
  ) %>%
  select(label, term, estimate, std.error,p.value) %>%
  arrange(term)


table_data_cace <- table_data_cace %>%
  filter(label != "Civic Participation Index") %>%
  pivot_wider(
    names_from = label,
    values_from = c(estimate, std.error, p.value)
  )

table_data_cace <- table_data_cace %>%
  pivot_longer(
    cols = starts_with("estimate_") | starts_with("std.error_") | starts_with("p.value_"),
    names_to = c("stat", "label"),
    names_sep = "_",
    values_to = "value",
    values_transform = list(value = as.character)  # Convert to character to handle stars and numeric together
  )


p_values_cace <- table_data_cace %>%
  filter(stat == "p.value") %>%
  select(label, term, p.value = value)  # Rename `value` to `p.value` for clarity

table_data_cace <- table_data_cace %>%
  left_join(p_values_cace, by = c("label", "term"))

table_data_cace <- table_data_cace %>%
  mutate(
    # Round and add significance stars based on `p.value`
    value = case_when(
      stat == "estimate" ~ paste0(sprintf("%.3f", as.numeric(value)),
                                  case_when(
                                    as.numeric(p.value) < 0.001 ~ "***",
                                    as.numeric(p.value) < 0.01 ~ "**",
                                    as.numeric(p.value) < 0.05 ~ "*",
                                    TRUE ~ ""
                                  )
      ),
      stat == "std.error" ~ sprintf("%.4f", as.numeric(value)),
      stat == "p.value" ~ sprintf("%.4f", as.numeric(value))
    )
  ) %>%
  select(-p.value)  %>%
  # Pivot to make `estimate`, `std.error`, and `p.value` columns wide
  pivot_wider(
    names_from = label,
    values_from = value
  ) %>%
  # Filter out rows where `stat` is `p.value`
  filter(stat != "p.value")  %>%
  # Correct way to drop the `p.value` column
  mutate(
    # Format std.error with parentheses and ensure estimate retains significance stars
    across(starts_with("Awareness Index"):starts_with("Engagement Behavior Index"),
           ~ ifelse(stat == "estimate", ., paste0("(", ., ")"))
    )
  ) %>%
  # Arrange terms in specified order
  arrange(
    match(term, c(
      "treatmentMedia Literacy", "spillover"
    )),
    stat
  ) %>%
  mutate(
    # Remove `term` for p.value rows and rename variables
    term = ifelse(stat == "std.error", "", term)
  ) %>%
  select(-stat) %>%
  rename(Outcome = term) %>%
  mutate(Outcome = case_when(
    Outcome == "treatmentMedia Literacy" ~ "Treatment",
    Outcome == "spillover" ~ "Indicator for Strata",
    TRUE ~ Outcome  # Keep other values unchanged
  )) %>%
  rename_with(~ gsub("Index$", "", .), ends_with("Index"))


table_data_cace <- table_data_cace %>%
  relocate("Awareness ", .after = last_col())

table_data_cace <- table_data_cace %>%
  select(Outcome, everything())

# Ensure no row names in final_data
rownames(table_data_cace) <- NULL

# Remove existing column names from final_data to avoid default headers in LaTeX output
colnames(table_data_cace) <- NULL


# Generate LaTeX table with xtable
latex_table_cace <- xtable(table_data_cace, 
                           caption = "Complier Average Causal Effects (CACE)", 
                           label = "tab:cace_main_estimates_terms")

# Manually specify header rows for two-line headers
header_row1_cace <- c("Outcome",  "Accuracy", "Sharing", "Health", "Source", "Engagement", "Engagement", "Awareness")
header_row2_cace <- c("",  "Discernment", "Discernment", "Preferences", "Discernment", "Attitude", "Behavior", "")


# Create the LaTeX code for the two-line headers with a single \hline after "Behavior" and no extra lines
header_latex_cace <- paste(
  paste(header_row1_cace, collapse = " & "), "\\\\",
  paste(header_row2_cace, collapse = " & "), "\\\\"
)

# Calculate the number of columns and rows as needed
num_cols_cace <- ncol(table_data_cace) - 1  # Adjust for the alignment
num_rows_cace <- nrow(latex_table_cace)     # Ensure numeric row count

# Footer
command2 <- paste0(
  "\\hline\n", 
  n_row, 
  r2_row, 
  "Library-Spillover FEs&  Yes &  Yes &  Yes &  Yes &  Yes &  Yes &  Yes \\\\ \n",
  "\\hline\n",
  "\\multicolumn{", num_cols_cace + 1, "}{l}{*p$<$0.05; **p$<$0.01; ***p$<$0.001. SEs clustered at the village level in parentheses.} \\\\ \n"
)

# Capture the LaTeX output as a character vector without row names
latex_output_cace <- capture.output(print(latex_table_cace, type = "latex", include.rownames = FALSE, 
                                          sanitize.text.function = identity, 
                                          align = c("l", "l", rep("c", num_cols_cace)),  # Use calculated number of columns
                                          add.to.row = list(
                                            pos = list(0, num_rows_cace),  # Use calculated row count
                                            command = c(
                                              header_latex_cace, command2)
                                          )
))

# Find the index of `\centering` and insert `\small` right after it
centering_index_cace <- which(grepl("\\centering", latex_output_cace))
latex_output_cace <- append(latex_output_cace, 
                            "\\resizebox{\\linewidth}{!}{%", 
                            after = centering_index_cace)

# Replace `\end{tabular}` with `%\n}`
latex_output_cace <- sub("\\\\end\\{tabular\\}", 
                         "\\\\end\\{tabular\\}%\n}", 
                         latex_output_cace)

# Write the captured output to the file
writeLines(latex_output_cace, "output/tables/tab_cace_main_results.tex")


### Treatment effect in % of control-group means ----
effects_in_perc_df <- bimli %>%
  mutate(
    engagement_attitude1 = engagement_attitude1a_raw + engagement_attitude1b_raw
  ) %>%
  select(treatment, 
         # Awareness
         news_manipulation1, news_manipulation2, # 1-4
         bias2, bias3, # 1-4
         # Accuracy discernment
         discernment7:discernment10, # 1-4
         discernment1:discernment5,# 1-4
         # Sharing discernment
         sharing7:sharing10, # 1-3
         sharing1:sharing5, # 1-3
         # Source discernment
         source_discern_generic1:source_discern_generic2, source_discern_generic5, # 1-4 (good)
         source_discern_generic3:source_discern_generic4, source_discern_generic6, # 1-4 (bad)
         source_discern_specific1:source_discern_specific3, # 1-4 (good)
         source_discern_specific4:source_discern_specific6, # 1-4 (bad)
         cues1:cues2, # 1-3 (good)
         cues3:cues4, # 1-3 (bad)
         # Health
         vaccine_safety1, # 1-3
         vaccine_safety2, # 1-3
         illness_response1, # 1-3
         illness_response2, # 1-3
         ayurveda, # 1-3
         # Engagement Attitude
         engagement_attitude1,
         engagement_attitude2_raw, # 1-3
         engagement_attitude3_raw, # 1-5
         # Engagement Behavior
         engagement_behavior1_raw, # 1-2 
         engagement_behavior2_raw # 1-2
  ) %>%
  mutate(across(
    -treatment, 
    ~ .x / abs((max(.x, na.rm = TRUE) - min(.x, na.rm = TRUE)))
  )) %>%
  mutate(
    # Accuracy discernment
    accuracy_discernment = rowSums(select(., discernment7:discernment10, discernment1:discernment5),
                                   na.rm = TRUE),
    # Sharing discernment
    sharing_discernment = rowSums(select(., sharing7:sharing10, sharing1:sharing5),
                                  na.rm = TRUE),
    # Source discernment
    source_discernment = rowSums(select(., 
                                        source_discern_generic1:source_discern_generic2, source_discern_generic5,
                                        source_discern_generic3:source_discern_generic4, source_discern_generic6,
                                        source_discern_specific1:source_discern_specific3,
                                        source_discern_specific4:source_discern_specific6,
                                        cues1:cues4),
                                 na.rm = TRUE),
    # Health
    health = rowSums(select(., vaccine_safety1, vaccine_safety2,
                            illness_response1, illness_response2, ayurveda),
                     na.rm = TRUE),
    # Engagement attitude
    engagement_attitude = rowSums(select(., engagement_attitude1,
                                         engagement_attitude2_raw, engagement_attitude3_raw),
                                  na.rm = TRUE),
    # Engagement behavior
    engagement_behavior = rowSums(select(., engagement_behavior1_raw, engagement_behavior2_raw),
                                  na.rm = TRUE),
    # Awareness
    awareness = rowSums(select(., news_manipulation1:news_manipulation2, bias2:bias3),
                        na.rm = TRUE)
  ) %>%
  select(treatment, accuracy_discernment:awareness) %>%
  # Calculate mean values for each index within each treatment group.
  group_by(treatment) %>%
  summarise(across(everything(), ~ mean(.x, na.rm = TRUE))) %>%
  ungroup() %>%
  # Reshape to have one row per index (variable) and separate columns for each treatment group.
  pivot_longer(cols = -treatment, names_to = "Variable", values_to = "Mean") %>%
  pivot_wider(names_from = treatment, values_from = Mean) %>%
  rename(
    Control = `Spoken English`,
    Treatment = `Media Literacy`
  ) %>%
  mutate(
    `% Treatment Effect` = round(((Treatment - Control) / abs(Control)) * 100, 2)) %>%
  rename(
    Outcome = Variable,
    `Control Mean` = Control,
    `Treatment Mean` = Treatment,
    `% Treatment Effect` = `% Treatment Effect`
  ) %>%
  mutate(
    Outcome = case_when(
      Outcome == "accuracy_discernment" ~ "Accuracy Discernment",
      Outcome == "sharing_discernment" ~ "Sharing Discernment",
      Outcome == "source_discernment" ~ "Source Discernment",
      Outcome == "health" ~ "Health Preferences",
      Outcome == "engagement_attitude" ~ "Engagement Attitude",
      Outcome == "engagement_behavior" ~ "Engagement Behavior",
      Outcome == "awareness" ~ "Awareness"
    )
  ) %>%
  mutate(
    `Control Mean` = round(`Control Mean`, 2),
    `Treatment Mean` = round(`Treatment Mean`, 2)
  )

effects_in_perc_df %>%
  xtable::xtable(caption = "Effects in \\% of Control Group Means", align = "rlrrr",
                 label = "tab:effects_in_perc") %>%
  
  print(include.rownames = FALSE, 
        caption.placement = "top",
        type= 'latex',
        sanitize.text.function = identity, # add backslashes to special characters
        sanitize.colnames.function = \(x) str_c("\\textbf{", gsub("%", "\\\\%", x), "}"),
        add.to.row = list(
          pos = list(nrow(.)), 
          command = c("\\hline\n\\multicolumn{4}{p{0.85\\textwidth}}{\\footnotesize The treatment effect is expressed as a percentage change relative to the control group mean. For each index, we first rescale all contributing items to range from 0 to 1 based on the observed minimum and maximum in the data. Composite indices are then computed by summing the relevant rescaled items. The percentage effect is calculated as: $\\frac{\\text{Treatment Mean} - \\text{Control Mean}}{|\\text{Control Mean}|}* 100$.} \\\\\n")),
        hline.after = c(-1, 0), # Add horizontal lines before and after headers
        comment = F,
        table.placement = getOption("xtable.table.placement", "h!"),
        file = paste0("output/tables/effects_in_perc", ".tex"))

# -----
# Follow-up survey ----
## Students ----
### Unadjusted ----
#### Create figure ----
regressions_follow_up %>%
  filter(is.na(subidx) & secondary_outcome == FALSE & mechanism == FALSE) %>% # omit questions that are used in a subindex
  mutate(label = if_else(str_detect(label, "Index"), "Index", label),
         label = as_factor(label),
         idx = as_factor(idx),
         is.idx = if_else(label == "Index", TRUE, FALSE),
         significance = ifelse(p.value < 0.05, "YES", "NO")) %>%
  ggplot(aes(x = estimate, y = fct_rev(label), color = is.idx, linetype = significance)) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_point() +
  geom_errorbar(aes(xmin = lower, xmax = upper), width = 0) +
  facet_grid(rows = "idx", scales = "free_y", space = "free_y", switch = "y") + # facet by index
  xlab("Estimated effect of treatment in control-group SDs (Positive expected)") +
  scale_y_discrete(position = "right") +
  scale_color_manual(values = c("black", "red")) +
  scale_linetype_manual(values=c("dotted", "solid")) +
  theme_bw() %+replace%
  theme(axis.title.y = element_blank(),
        strip.text.y.left = element_text(angle = 0),
        legend.position = "none")
ggsave("output/figures/effects_follow_up.pdf", height = 3, width = 8)


#### Create table ----
regressions_follow_up$label[regressions_follow_up$dv == "follow_accuracy_discernment"] <- "Accuracy Discernment"

regressions_follow_up$label[regressions_follow_up$dv == "follow_accuracy_discernment_political"] <- "Political Discernment "

regressions_follow_up$label[regressions_follow_up$dv == "follow_source_discernment"] <- "Source Discernment"

regressions_follow_up <- regressions_follow_up %>%
  arrange(
    factor(dv, levels = c(
      "follow_accuracy_discernment",
      "follow_accuracy_true",
      "follow_accuracy_false",
      "follow_accuracy_discernment_political",
      "follow_discernment_political_true",
      "follow_discernment_political_false",
      "follow_source_discernment",
      "follow_source_discern_specific_good",
      "follow_source_discern_specific_bad"
    ))
  )

regressions_follow_up %>%
  select(label, type, n, estimate, std.error, p.value) %>%
  mutate(type = case_match(type,
                           "idx" ~ "Index", 
                           "subidx" ~ "Sub-index"),
         n = format(n, big.mark = ","),
         sig.stars = case_when(p.value < 0.001 ~ "***",
                               p.value < 0.01 ~ "**",
                               p.value < 0.05 ~ "*",
                               TRUE ~ ""),
         estimate = str_c(format(round(estimate, 2), nsmall = 2), sig.stars), # add significance stars to estimate
         std.error = round(std.error, 3),
         p.value = case_when(p.value > 0.9 ~ "$>$0.9",
                             p.value < 0.001 ~ "$<$0.001",
                             p.value < 0.01 ~ as.character(round(p.value, 3)),
                             TRUE ~ as.character(round(p.value, 2)))) %>%
  select(-sig.stars) %>%
  rename(Outcome = label,
         Type = type,
         N = n,
         Estimate = estimate,
         SE = std.error,
         "p-value" = p.value) %>%
  xtable::xtable(caption = "Effect of assignment to BIMLI treatment on 4-month follow-up", align = "lllllll", digits = 3,
                 label = paste0("tab:", "unadjusted_follow_up")) %>%
  print(include.rownames = FALSE, 
        caption.placement = "top",
        sanitize.text.function = identity, # add backslashes to special characters
        sanitize.colnames.function = \(x) str_c("\\textbf{", x, "}"),
        add.to.row = list(pos = list(nrow(.)), 
                          command = paste(
                            "\\hline\n\\multicolumn{6}{l}{\\footnotesize *p$<$0.05; **p$<$0.01; ***p$<$0.001. Models include library-spillover strata FEs.} \\\\\n")),
        hline.after = c(-1,0), # add horizontal lane after second to last column
        comment = FALSE,
        table.placement = getOption("xtable.table.placement", "h!"),
        file = paste0("output/tables/tab_unadjusted_", "follow_up", ".tex"))

### Adjusted ----
regressions_follow_up_adj %>%
  filter(term == "treatmentMedia Literacy") %>%
  filter(is.na(subidx) & secondary_outcome == FALSE & mechanism == FALSE) %>% # omit questions that are used in a subindex
  mutate(label = if_else(str_detect(label, "Index"), "Index", label),
         label = as_factor(label),
         idx = as_factor(idx),
         is.idx = if_else(label == "Index", TRUE, FALSE),
         significance = ifelse(p.value < 0.05, "YES", "NO")) %>%
  ggplot(aes(x = estimate, y = fct_rev(label), color = is.idx, linetype = significance)) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_point() +
  geom_errorbar(aes(xmin = lower, xmax = upper), width = 0) +
  facet_grid(rows = "idx", scales = "free_y", space = "free_y", switch = "y") + # facet by index
  xlab("Estimated effect of treatment in control-group SDs (Positive expected)") +
  scale_y_discrete(position = "right") +
  scale_color_manual(values = c("black", "red")) +
  scale_linetype_manual(values=c("dotted", "solid")) +
  theme_bw() %+replace%
  theme(axis.title.y = element_blank(),
        strip.text.y.left = element_text(angle = 0),
        legend.position = "none")
ggsave("output/figures/effects_follow_up_adj.pdf", height = 3, width = 8)

## Parents ----
### Create figure ----
regressions_guardian %>%
  #filter(!idx == "Source Discernment" & !idx == "Political Discernment") %>%
  filter(is.na(subidx) & secondary_outcome == FALSE & mechanism == FALSE & follow == TRUE) %>% # omit questions that are used in a subindex
  mutate(label = if_else(str_detect(label, "Index"), "Index", label),
         label = as_factor(label),
         idx = as_factor(idx),
         follow_fact = as_factor(
           case_when(
             follow == TRUE ~ "Follow-Up",
             follow == FALSE ~ "Endline")
         ),
         is.idx = if_else(label == "Index", TRUE, FALSE),
         significance = ifelse(p.value < 0.05, "YES", "NO")) %>%
  ggplot(aes(x = estimate, y = fct_rev(label), color = is.idx, linetype = significance)) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_point() +
  geom_errorbar(aes(xmin = lower, xmax = upper), width = 0) +
  #facet_grid(rows = "idx", scales = "free_y", space = "free_y", switch = "y") + # facet by index
  facet_grid(idx ~ follow_fact, scales = "free_y", space = "free_y", switch = "y") + # facet by index
  xlab("Estimated effect of treatment in control-group SDs (Positive expected)") +
  scale_y_discrete(position = "right") +
  scale_color_manual(values = c("black", "red")) +
  scale_linetype_manual(values=c("dotted", "solid")) +
  theme_bw() %+replace%
  theme(axis.title.y = element_blank(),
        strip.text.y.left = element_text(angle = 0),
        legend.position = "none")
ggsave("output/figures/effects_guardian_follow.pdf", height = 2, width = 8)

### Create table ----
regressions_guardian %>%
  filter(dv %in% c("follow_guardian_accuracy_discernment",
                   "follow_guardian_accuracy_false",
                   "follow_guardian_accuracy_true")) %>% # filter to accuracy discernment outcomes since that result is the only one robust to controlling for guardian interview timing (see script 17_robust_follow_up_order.R and Appendix I.2)
  mutate(label = ifelse(label == "Index", "Accuracy Discernment Index", label)) %>%
  select(label, type, n, estimate, std.error, p.value) %>%
  mutate(type = case_match(type,
                           "idx" ~ "Index", 
                           "subidx" ~ "Sub-index"),
         n = format(n, big.mark = ","),
         sig.stars = case_when(p.value < 0.001 ~ "***",
                               p.value < 0.01 ~ "**",
                               p.value < 0.05 ~ "*",
                               TRUE ~ ""),
         estimate = str_c(format(round(estimate, 2), nsmall = 2), sig.stars), # add significance stars to estimate
         std.error = round(std.error, 3),
         p.value = case_when(p.value > 0.9 ~ "$>$0.9",
                             p.value < 0.001 ~ "$<$0.001",
                             p.value < 0.01 ~ as.character(round(p.value, 3)),
                             TRUE ~ as.character(round(p.value, 2)))) %>%
  select(-sig.stars) %>%
  rename(Outcome = label,
         Type = type,
         N = n,
         Estimate = estimate,
         SE = std.error,
         "p-value" = p.value) %>%
  xtable::xtable(caption = "Effect of assignment to BIMLI on treatment group parents", 
                 align = "lllllll", digits = 3,
                 label = paste0("tab:", "unadjusted_follow_up_guardian")) %>%
  print(include.rownames = FALSE, 
        caption.placement = "top",
        sanitize.text.function = identity, # add backslashes to special characters
        sanitize.colnames.function = \(x) str_c("\\textbf{", x, "}"),
        add.to.row = list(pos = list(nrow(.)), 
                          command = paste(
                            "\\hline\n\\multicolumn{6}{l}{\\footnotesize *p$<$0.05; **p$<$0.01; ***p$<$0.001. Models include library-spillover strata FEs.} \\\\\n")),
        hline.after = c(-1,0), # add horizontal lane after second to last column
        comment = FALSE,
        table.placement = getOption("xtable.table.placement", "h!"),
        file = paste0("output/tables/tab_unadjusted_", "follow_up_guardian", ".tex"))

# END of 09_regressions_main.R ----









